{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import pdb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the labeled training split from ./data/train/news.csv.\n",
    "data_split = 'train'\n",
    "path = os.path.join('./data/', data_split)\n",
    "file_name = os.path.join(path, 'news.csv')\n",
    "train_frame = pd.read_csv(file_name)\n",
    "# Split each 'Report Content' string on the '##' separator into a list of reports.\n",
    "# The vectorized .str accessor leaves missing values as NaN, whereas calling\n",
    "# x.split on a NaN float inside apply() raises AttributeError.\n",
    "train_frame['Report Content'] = train_frame['Report Content'].str.split('##')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of training data is 10587, the fake news is 2743\n"
     ]
    }
   ],
   "source": [
    "# Report the size of the training split and how many rows carry label == 1.\n",
    "print(\"Number of training data is %d, the fake news is %d\" % (\n",
    "    len(train_frame),\n",
    "    (train_frame['label'] == 1).sum(),\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Ofiicial Account Name</th>\n",
       "      <th>Title</th>\n",
       "      <th>News Url</th>\n",
       "      <th>Image Url</th>\n",
       "      <th>Report Content</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>环球人物</td>\n",
       "      <td>中国反腐风刮到阿根廷，这个美到让人瘫痪的女总统，因为8个本子摊上大事了</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTAzNDI4MDc2MQ...</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/hpcO6kWnPm6cX3M...</td>\n",
       "      <td>[内容不符]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>西湖之声</td>\n",
       "      <td>腾讯为《如懿传》道歉？这部3亿大剧上映第一天遭网友狂吐槽：愣是拍成村头恋曲...</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTA2Mjk0MTE2MA...</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/vQCGoQzHAbaAXRr...</td>\n",
       "      <td>[满口胡言]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>厦门晚报</td>\n",
       "      <td>顺风车司机奸杀20岁女乘客，落网视频曝光！滴滴道歉…</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTA3NzI1Mzg4MQ...</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/TxqQX9BtmOMpwDZ...</td>\n",
       "      <td>[？ ]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>腾讯娱乐</td>\n",
       "      <td>偶遇鹿晗关晓彤旅行过七夕，小情侣是真滴甜...</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTA5NTIzNDE2MQ...</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/9Ju9PZ1NxhfkIZ3...</td>\n",
       "      <td>[领个屁证，过你妹的七夕，几天前的图在今天拿来博眼球]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>腾讯娱乐</td>\n",
       "      <td>赵丽颖和冯绍峰即将公布恋情？网友：曝不曝没区别啊</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTA5NTIzNDE2MQ...</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/9Ju9PZ1NxhdTkXb...</td>\n",
       "      <td>[事件不实。]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Ofiicial Account Name                                     Title  \\\n",
       "0                  环球人物       中国反腐风刮到阿根廷，这个美到让人瘫痪的女总统，因为8个本子摊上大事了   \n",
       "1                  西湖之声  腾讯为《如懿传》道歉？这部3亿大剧上映第一天遭网友狂吐槽：愣是拍成村头恋曲...   \n",
       "2                  厦门晚报                顺风车司机奸杀20岁女乘客，落网视频曝光！滴滴道歉…   \n",
       "3                  腾讯娱乐                   偶遇鹿晗关晓彤旅行过七夕，小情侣是真滴甜...   \n",
       "4                  腾讯娱乐                  赵丽颖和冯绍峰即将公布恋情？网友：曝不曝没区别啊   \n",
       "\n",
       "                                            News Url  \\\n",
       "0  http://mp.weixin.qq.com/s?__biz=MTAzNDI4MDc2MQ...   \n",
       "1  http://mp.weixin.qq.com/s?__biz=MTA2Mjk0MTE2MA...   \n",
       "2  http://mp.weixin.qq.com/s?__biz=MTA3NzI1Mzg4MQ...   \n",
       "3  http://mp.weixin.qq.com/s?__biz=MTA5NTIzNDE2MQ...   \n",
       "4  http://mp.weixin.qq.com/s?__biz=MTA5NTIzNDE2MQ...   \n",
       "\n",
       "                                           Image Url  \\\n",
       "0  http://mmbiz.qpic.cn/mmbiz_jpg/hpcO6kWnPm6cX3M...   \n",
       "1  http://mmbiz.qpic.cn/mmbiz_jpg/vQCGoQzHAbaAXRr...   \n",
       "2  http://mmbiz.qpic.cn/mmbiz_jpg/TxqQX9BtmOMpwDZ...   \n",
       "3  http://mmbiz.qpic.cn/mmbiz_jpg/9Ju9PZ1NxhfkIZ3...   \n",
       "4  http://mmbiz.qpic.cn/mmbiz_jpg/9Ju9PZ1NxhdTkXb...   \n",
       "\n",
       "                Report Content  label  \n",
       "0                       [内容不符]      0  \n",
       "1                       [满口胡言]      0  \n",
       "2                         [？ ]      0  \n",
       "3  [领个屁证，过你妹的七夕，几天前的图在今天拿来博眼球]      0  \n",
       "4                      [事件不实。]      0  "
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_frame.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the labeled test split from ./data/test/news.csv.\n",
    "data_split = 'test'\n",
    "path = os.path.join('./data/', data_split)\n",
    "file_name = os.path.join(path, 'news.csv')\n",
    "test_frame = pd.read_csv(file_name)\n",
    "# Split each 'Report Content' string on the '##' separator into a list of reports.\n",
    "# The vectorized .str accessor leaves missing values as NaN, whereas calling\n",
    "# x.split on a NaN float inside apply() raises AttributeError.\n",
    "test_frame['Report Content'] = test_frame['Report Content'].str.split('##')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of test data is 10141, the fake news is 1482\n"
     ]
    }
   ],
   "source": [
    "# Report the size of the test split and how many rows carry label == 1.\n",
    "print(\"Number of test data is %d, the fake news is %d\" % (\n",
    "    len(test_frame),\n",
    "    (test_frame['label'] == 1).sum(),\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Ofiicial Account Name</th>\n",
       "      <th>Title</th>\n",
       "      <th>News Url</th>\n",
       "      <th>Image Url</th>\n",
       "      <th>Report Content</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>私家车第一广播</td>\n",
       "      <td>国务院宣布：生孩子有补助了！明年1月起实施，浙江属于这档！</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTA1NTc0MjE0MA...</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/j27ttKHs7TlFAL5...</td>\n",
       "      <td>[国务院没有发布过类似信息]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>杭州交通918</td>\n",
       "      <td>4个年轻帅小伙突然人没了, 身亡真相惊呆所有人! 太可惜了</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTA5Mzc3MDQyMA...</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/0y9ibmULDTbDuCt...</td>\n",
       "      <td>[？？？？]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>腾讯娱乐</td>\n",
       "      <td>迪丽热巴时装周走秀气场一米八，病态妆容也挡不住她的高级感</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTA5NTIzNDE2MQ...</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/9Ju9PZ1NxhfdGHM...</td>\n",
       "      <td>[那个泰国人不是模特]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>腾讯娱乐</td>\n",
       "      <td>李晨北京四合院内景曝光，还和妈妈一起吃饺子画面hin温馨</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTA5NTIzNDE2MQ...</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/9Ju9PZ1Nxhd8SmK...</td>\n",
       "      <td>[造谣生事]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>央视新闻</td>\n",
       "      <td>唾液测天赋、饭后剧烈运动得阑尾炎...8月“科学”流言 你中招了吗？</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTI0MDU3NDYwMQ...</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/oq1PymRl9D7ZOQU...</td>\n",
       "      <td>[唾液基因检测的确可以找出孩子的优势潜能，明确孩子的培养方向，科学正确引导孩子发展长处，助孩...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Ofiicial Account Name                               Title  \\\n",
       "0               私家车第一广播       国务院宣布：生孩子有补助了！明年1月起实施，浙江属于这档！   \n",
       "1               杭州交通918       4个年轻帅小伙突然人没了, 身亡真相惊呆所有人! 太可惜了   \n",
       "2                  腾讯娱乐        迪丽热巴时装周走秀气场一米八，病态妆容也挡不住她的高级感   \n",
       "3                  腾讯娱乐        李晨北京四合院内景曝光，还和妈妈一起吃饺子画面hin温馨   \n",
       "4                  央视新闻  唾液测天赋、饭后剧烈运动得阑尾炎...8月“科学”流言 你中招了吗？   \n",
       "\n",
       "                                            News Url  \\\n",
       "0  http://mp.weixin.qq.com/s?__biz=MTA1NTc0MjE0MA...   \n",
       "1  http://mp.weixin.qq.com/s?__biz=MTA5Mzc3MDQyMA...   \n",
       "2  http://mp.weixin.qq.com/s?__biz=MTA5NTIzNDE2MQ...   \n",
       "3  http://mp.weixin.qq.com/s?__biz=MTA5NTIzNDE2MQ...   \n",
       "4  http://mp.weixin.qq.com/s?__biz=MTI0MDU3NDYwMQ...   \n",
       "\n",
       "                                           Image Url  \\\n",
       "0  http://mmbiz.qpic.cn/mmbiz_jpg/j27ttKHs7TlFAL5...   \n",
       "1  http://mmbiz.qpic.cn/mmbiz_jpg/0y9ibmULDTbDuCt...   \n",
       "2  http://mmbiz.qpic.cn/mmbiz_jpg/9Ju9PZ1NxhfdGHM...   \n",
       "3  http://mmbiz.qpic.cn/mmbiz_jpg/9Ju9PZ1Nxhd8SmK...   \n",
       "4  http://mmbiz.qpic.cn/mmbiz_jpg/oq1PymRl9D7ZOQU...   \n",
       "\n",
       "                                      Report Content  label  \n",
       "0                                     [国务院没有发布过类似信息]      0  \n",
       "1                                             [？？？？]      0  \n",
       "2                                        [那个泰国人不是模特]      0  \n",
       "3                                             [造谣生事]      0  \n",
       "4  [唾液基因检测的确可以找出孩子的优势潜能，明确孩子的培养方向，科学正确引导孩子发展长处，助孩...      0  "
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_frame.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the unlabeled split from ./data/unlabeled data/news.csv.\n",
    "data_split = 'unlabeled data'\n",
    "path = os.path.join('./data/', data_split)\n",
    "file_name = os.path.join(path, 'news.csv')\n",
    "un_frame = pd.read_csv(file_name)\n",
    "# Split each 'Report Content' string on the '##' separator into a list of reports.\n",
    "# The vectorized .str accessor leaves missing values as NaN, whereas calling\n",
    "# x.split on a NaN float inside apply() raises AttributeError.\n",
    "un_frame['Report Content'] = un_frame['Report Content'].str.split('##')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of unlabeled data is 67748\n"
     ]
    }
   ],
   "source": [
    "# Report the size of the unlabeled split (this frame has no 'label' column to count).\n",
    "print(\"Number of unlabeled data is %d\" % len(un_frame))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Image Url</th>\n",
       "      <th>News Url</th>\n",
       "      <th>Ofiicial Account Name</th>\n",
       "      <th>Report Content</th>\n",
       "      <th>Title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/hNIfUeDqtnzpxX5...</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTAyNTI4NDgyMQ...</td>\n",
       "      <td>电子竞技</td>\n",
       "      <td>[所属内容不实]</td>\n",
       "      <td>直言不讳 | 为什么要包容RNG？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/pSEjsWXoC3qFM10...</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTAzMDM2MjI4MQ...</td>\n",
       "      <td>腾讯大秦网</td>\n",
       "      <td>[欺诈]</td>\n",
       "      <td>31省份最低工资排行出炉：上海2420最高，陕西是……</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/pSEjsWXoC3pia6u...</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTAzMDM2MjI4MQ...</td>\n",
       "      <td>腾讯大秦网</td>\n",
       "      <td>[清谷田园并未使用青岛工厂提供的原料, 该品牌果汁的原料跟发生烂苹果事件的工厂无任何关系。,...</td>\n",
       "      <td>可怕！国产果汁潜规则曝光，2毛一斤腐烂果被加工成高端果汁！</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/pSEjsWXoC3pia6u...</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTAzMDM2MjI4MQ...</td>\n",
       "      <td>腾讯大秦网</td>\n",
       "      <td>[鱼化寨要拆了不实信息]</td>\n",
       "      <td>鱼化寨要拆了？再见了，西安“小香港”？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_jpg/2EVtnKem0SUGprk...</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTA0MzM2MTc4MQ...</td>\n",
       "      <td>不弄头发就闹心</td>\n",
       "      <td>[我是云南人，没听过这种陋习]</td>\n",
       "      <td>云南摸-奶节真实体验 场面不忍直视</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           Image Url  \\\n",
       "0  http://mmbiz.qpic.cn/mmbiz_jpg/hNIfUeDqtnzpxX5...   \n",
       "1  http://mmbiz.qpic.cn/mmbiz_jpg/pSEjsWXoC3qFM10...   \n",
       "2  http://mmbiz.qpic.cn/mmbiz_jpg/pSEjsWXoC3pia6u...   \n",
       "3  http://mmbiz.qpic.cn/mmbiz_jpg/pSEjsWXoC3pia6u...   \n",
       "4  http://mmbiz.qpic.cn/mmbiz_jpg/2EVtnKem0SUGprk...   \n",
       "\n",
       "                                            News Url Ofiicial Account Name  \\\n",
       "0  http://mp.weixin.qq.com/s?__biz=MTAyNTI4NDgyMQ...                  电子竞技   \n",
       "1  http://mp.weixin.qq.com/s?__biz=MTAzMDM2MjI4MQ...                 腾讯大秦网   \n",
       "2  http://mp.weixin.qq.com/s?__biz=MTAzMDM2MjI4MQ...                 腾讯大秦网   \n",
       "3  http://mp.weixin.qq.com/s?__biz=MTAzMDM2MjI4MQ...                 腾讯大秦网   \n",
       "4  http://mp.weixin.qq.com/s?__biz=MTA0MzM2MTc4MQ...               不弄头发就闹心   \n",
       "\n",
       "                                      Report Content  \\\n",
       "0                                           [所属内容不实]   \n",
       "1                                               [欺诈]   \n",
       "2  [清谷田园并未使用青岛工厂提供的原料, 该品牌果汁的原料跟发生烂苹果事件的工厂无任何关系。,...   \n",
       "3                                       [鱼化寨要拆了不实信息]   \n",
       "4                                    [我是云南人，没听过这种陋习]   \n",
       "\n",
       "                           Title  \n",
       "0              直言不讳 | 为什么要包容RNG？  \n",
       "1    31省份最低工资排行出炉：上海2420最高，陕西是……  \n",
       "2  可怕！国产果汁潜规则曝光，2毛一斤腐烂果被加工成高端果汁！  \n",
       "3            鱼化寨要拆了？再见了，西安“小香港”？  \n",
       "4              云南摸-奶节真实体验 场面不忍直视  "
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "un_frame.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
